Fecal Microbiota, Fecal Metabolome, and Colorectal Cancer Interrelations

Chien-Hung Lu

2024-08-17

Background

# Check the first 5 rows of data
head(demo)
##        ID case age      bmi sex race hosp1 hosp2 PALMITOYL_SPHINGOMYELIN
## 1 CRC.222    1  57 24.56060   1    0     0     0               2.1597335
## 2  CRC.57    1  84 24.40972   1    0     0     0               0.7875869
## 3  CRC.82    1  53 25.39408   0    0     0     0               0.1100559
## 4 CRC.220    1  67 33.19722   0    0     0     0              -0.1140084
## 5  CRC.90    1  75 19.96686   1    0     0     0               0.9801817
## 6 CRC.180    1  51 25.82185   0    0     1     0              -0.3014311
##    MANDELATE P_HYDROXYBENZALDEHYDE ALPHA_TOCOPHEROL GAMMA_TOCOPHEROL SITOSTANOL
## 1  1.6465297            0.08340372       -0.5776607       -1.7603930 -1.1044717
## 2 -2.2457098           -0.21173592        0.5475180        0.2218826  0.9375046
## 3  0.9128680            0.46993266       -1.3736785       -1.2004098  0.2241344
## 4  0.4427066            0.35402359        0.3992775        1.0339758 -1.4640268
## 5  1.0541146            0.74434868        0.5373425        0.1990403  0.8703116
## 6  0.4081363            0.16390999       -0.5725462        0.4573616  0.5149290
##   X_3_DEHYDROCARNITINE      PTERIN CONJUGATED_LINOLEATE__18_2N7
## 1           1.72269041 -0.06827190                    0.9405190
## 2           0.73262197 -0.16149973                   -0.3450236
## 3           0.03498601 -1.01269861                   -0.8189785
## 4          -3.43154403 -0.06672365                   -0.8819016
## 5          -0.07164571  0.01408302                   -0.3953074
## 6           0.09810451  0.13279694                    0.4864226
##   N__2_FUROYL_GLYCINE P_AMINOBENZOATE__PABA_ c_Clostridia f_Lachnospiraceae
## 1          -1.0611150            -1.27250375   -2.1446811        -1.2179051
## 2           1.1174587             0.64137522   -0.8134043        -1.1171948
## 3           0.1007269            -0.57635035   -1.4529813        -1.2388488
## 4          -0.4619462            -0.34420501    1.2366855        -0.1024485
## 5           0.2634145             0.03675639   -1.5746386        -1.3293538
## 6           0.2336798            -0.12898916   -1.5157212        -0.9967520
##   p_Fusobacteria g_Fusobacterium g_Atopobium g_Porphyromonas
## 1              1               1           0               0
## 2              0               0           0               0
## 3              0               0           0               0
## 4              0               0           0               0
## 5              0               0           0               1
## 6              0               0           0               1
# Subsetting the first 8 columns as demographics data
demo <- demo[,c(1:8)]

Data cleaning and refactor the values

# Factor and relabel the numeric data
demo$case <- factor(demo$case, labels = c('patient','control'))
demo$sex <- factor(demo$sex, labels = c('male','female'))
demo$race <- factor(demo$race, labels = c('white','black and others'))

# Create a new column for hospital and factorize
demo$hospital <- ifelse(demo$hosp1 == 1, "A", ifelse(demo$hosp2 == 1, "B", "C"))
demo$hospital <- factor(demo$hospital, labels = c('Walter Reed Army Medical Center', 'George Washington University Hospital', 'National Naval Medical Center'))

# Delete the hosp1 and hosp2 columns
demo <- demo[,c(-7,-8)]

# Have a summary of data
summary(demo)
##       ID                 case         age             bmi            sex    
##  Length:131         patient:89   Min.   :25.00   Min.   :16.83   male  :81  
##  Class :character   control:42   1st Qu.:53.00   1st Qu.:22.96   female:50  
##  Mode  :character                Median :63.00   Median :24.56              
##                                  Mean   :59.99   Mean   :25.60              
##                                  3rd Qu.:68.50   3rd Qu.:27.94              
##                                  Max.   :89.00   Max.   :40.19              
##                race                                      hospital 
##  white           :107   Walter Reed Army Medical Center      :54  
##  black and others: 24   George Washington University Hospital:26  
##                         National Naval Medical Center        :51  
##                                                                   
##                                                                   
## 

Next, we split the dataset into patient and control group.

# Use filter method to split data, case 0 refers to patient group, case 1 refers to control group
demo_pat <- demo %>% filter(case == "patient")
  
demo_cont <-demo %>% filter(case == "control")

Visulization of data

par(mfrow=c(1, 2))
hist(demo$age, col = "steelblue")
hist(demo$bmi, col = "steelblue")

Most of our data lie on the QQline, indicating that the data is normally distributed.

par(mfrow=c(1,2))
##QQPlot for age
qqnorm(demo$age, main = "Normal Q-Q Plot Age")

#QQLine adds a straight line for reference
qqline(demo$age)

##QQPlot for BMI
qqnorm(demo$bmi,main= "Normal Q-Q Plot BMI")

#QQLine adds a straight line for reference
qqline(demo$bmi)

Use Shapiro test to check if age and bmi are normally distributed

shapiro.test(demo$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  demo$age
## W = 0.98093, p-value = 0.06247
shapiro.test(demo$bmi)
## 
##  Shapiro-Wilk normality test
## 
## data:  demo$bmi
## W = 0.95522, p-value = 0.0002748

Comparing the difference between patients with CRC and control group

# Compare sex difference between patient and control groups
chisq.test(demo$sex, demo$case)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  demo$sex and demo$case
## X-squared = 0.032729, df = 1, p-value = 0.8564
# Compare sex difference between patient and control groups
chisq.test(demo$race, demo$case)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  demo$race and demo$case
## X-squared = 1.8431, df = 1, p-value = 0.1746
# Compare hospital difference between patient and control groups
chisq.test(demo$hospital, demo$case)
## 
##  Pearson's Chi-squared test
## 
## data:  demo$hospital and demo$case
## X-squared = 1.9685, df = 2, p-value = 0.3737
# Compare age difference between patient and control groups
t.test(demo$age ~ demo$case)
## 
##  Welch Two Sample t-test
## 
## data:  demo$age by demo$case
## t = -2.0612, df = 79.797, p-value = 0.04254
## alternative hypothesis: true difference in means between group patient and group control is not equal to 0
## 95 percent confidence interval:
##  -9.9411459 -0.1744239
## sample estimates:
## mean in group patient mean in group control 
##              58.37079              63.42857
# Compare BMI difference between patient and control groups
t.test(demo$bmi ~ demo$case)
## 
##  Welch Two Sample t-test
## 
## data:  demo$bmi by demo$case
## t = 0.20893, df = 88.087, p-value = 0.835
## alternative hypothesis: true difference in means between group patient and group control is not equal to 0
## 95 percent confidence interval:
##  -1.362230  1.682314
## sample estimates:
## mean in group patient mean in group control 
##              25.64960              25.48956

Use left_join to combine demographics, microbiota data, and metabolites data

demo_mb  <- left_join(demo, mb, by = "ID")
demo_mb_mt  <- left_join(demo_mb, mt, by = "ID")

par(mfrow=c(1,2))

pic <- ggplot(demo_mb_mt, aes(x = Root.k__Archaea, y = X_1_11_UNDECANEDICARBOXYLATE, col = as.factor(case)))

pic + geom_point()

pic2 <- ggplot(demo_mb_mt, aes(x = Root.k__Bacteria, y = X_1_2_PROPANEDIOL, col = as.factor(case)))

pic2 + geom_point()

Check the correlation between fecal microbiota-microbiota and microbiota-metabolite in CRC and control groups

# Use filter method to split data to patient group and control group
demo_mb_mt_pat <- demo_mb_mt %>% filter(case == "patient")
demo_mb_mt_cont <-demo_mb_mt %>% filter(case == "control")

demo_mb_mt_pat <- demo_mb_mt_pat[,c(8:757)]
demo_mb_mt_cont <- demo_mb_mt_cont[,c(8:757)]

par(mfrow=c(1,2))
Heatmap(cor(demo_mb_mt_pat), show_column_names = FALSE, show_row_names = FALSE)

Heatmap(cor(demo_mb_mt_cont), show_column_names = FALSE, show_row_names = FALSE)

Use which() function to find out the most significant microbiota and metabolites pair, and see if there is group difference

h1 <- cor(demo_mb_mt_pat)
h2 <- cor(demo_mb_mt_cont)

h1_which <- which(((1>h1*h1) & (h1*h1>0.9)), arr.ind=T)
h2_which <- which(((1>h2*h2) & (h2*h2>0.9)), arr.ind=T)

# Find out most significant correlation pair in patient group
head(h1_which)
##                                  row col
## Root.k__Bacteria                   2   1
## Root.k__Archaea.p__Euryarchaeota   3   1
## Root.k__Archaea                    1   2
## Root.k__Archaea.p__Euryarchaeota   3   2
## Root.k__Archaea                    1   3
## Root.k__Bacteria                   2   3
# Find out most significant correlation pair in control group
head(h2_which)
##                                                                           row
## Root.k__Bacteria.p__.c__                                                   18
## Root.k__Bacteria.p__.c__.o__                                               39
## Root.k__Bacteria.p__.c__.o__.f__                                           75
## Root.k__Bacteria.p__.c__.o__.f__.g__                                      130
## Root.k__Bacteria.p__Actinobacteria.c__Actinobacteria                       19
## Root.k__Bacteria.p__Actinobacteria.c__Actinobacteria.o__Bifidobacteriales  41
##                                                                           col
## Root.k__Bacteria.p__.c__                                                    4
## Root.k__Bacteria.p__.c__.o__                                                4
## Root.k__Bacteria.p__.c__.o__.f__                                            4
## Root.k__Bacteria.p__.c__.o__.f__.g__                                        4
## Root.k__Bacteria.p__Actinobacteria.c__Actinobacteria                        6
## Root.k__Bacteria.p__Actinobacteria.c__Actinobacteria.o__Bifidobacteriales   6

Conclusion

Discussion and future direction